#!/usr/bin/python
#-*- coding: UTF-8

# Version of the encoding where all values (case-mapping text) are combined
# into a single array: VALUES_I holds an offset into that array, and VALUES_C
# holds the original Unicode codepoints for collision detection.

import sys
import mph


def usage():
	"""Print command-line usage help to standard output.

	The program name shown in the first line is taken from sys.argv[0].
	"""
	lines = (
		'usage: %s [TAG] [BMP_ONLY]' % sys.argv[0],
		'',
		'  [TAG]      - tag to use to prefix hashtables',
		'  [BMP_ONLY] - flag to indicate if set is BMP-only, 0 or 1 (false or true)',
	)
	# Written through sys.stdout.write() rather than the print statement so
	# the file parses under both Python 2 and Python 3; output is identical.
	sys.stdout.write('\n'.join(lines) + '\n')

# Script entry point: read "CODEPOINT REPLACEMENT[,REPLACEMENT...]" lines from
# stdin, pack all formatted replacements into one combined string and emit the
# hash tables through the mph helper module.
if __name__ == '__main__':
	# Both TAG and BMP_ONLY are required.
	if len(sys.argv) < 3:
		usage()
		sys.exit(1)

	TAG = sys.argv[1]
	# BMP_ONLY is given as an integer flag (0 or 1) on the command line.
	bmp_only = bool(int(sys.argv[2]))

	# Maps codepoint -> (codepoint, offset into `combined`).
	# (Named `entries` rather than `dict` to avoid shadowing the builtin.)
	entries = {}
	combined = '\\x00'  # offset 0 is normally impossible, it is used for signaling collision

	for line in sys.stdin:
		# Each input line is expected to hold exactly two space-separated fields.
		codepoint, replacement = line.split(' ')
		replacement = mph.format_replacement(replacement.strip().split(','))

		# Nothing to store for this codepoint.
		if not replacement:
			continue

		# formatted replacement is always "\xYY", i.e. 4 byte len, so the
		# offset is counted in 4-character escape units. Floor division keeps
		# the offset an integer on Python 3 as well as Python 2.
		offset = len(combined) // 4
		# Every replacement is followed by a "\x00" terminator escape.
		combined += replacement + '\\x00'

		entries[codepoint] = (codepoint, offset)

	(G, V) = mph.create_minimal_perfect_hash(entries)
	# Internal sanity check: both generated tables must have the same length.
	assert len(G) == len(V)

	# Emit the generated output pieces in order.
	mph.gen_header(TAG, G, combined)
	mph.gen_includes()
	mph.gen_G(TAG, G)
	mph.gen_values(TAG, G, V, bmp_only)
	mph.gen_combined(TAG, combined)
